I. Introduction
# Load the salary data (comma-separated, first row holds the column names)
# and preview its first rows.
d <- read.table(file = "salary.txt", header = TRUE, sep = ",")
head(x = d)
## wage edu exp city reg race deg com emp
## 1 354.94 7 45 yes northeast white no 24.3 200
## 2 370.37 9 9 yes northeast white no 26.2 130
## 3 754.94 11 46 yes northeast white no 26.4 153
## 4 593.54 12 36 yes northeast other no 9.9 86
## 5 377.23 16 22 yes northeast white yes 7.1 181
## 6 284.90 8 51 yes northeast white no 11.4 32
# Second copy of the salary data; this is the frame used for the plots and
# the train/test split below.
df <- read.csv("salary.txt", header = TRUE)

# exploratory data analysis
# One scatter plot of log(wage) against experience per region, with one
# super-smoother curve per race group.
df$reg2 <- as.character(df$reg)
regions <- unique(df$reg2)

# Plotting symbol by college degree: 1 (circle) = degree, 4 (cross) = none.
df$deg01[df$deg == "yes"] <- 1
df$deg01[df$deg == "no"] <- 4
#par(mfrow =c(2,2))

# Race masks over the full data set and the line colour of each group.
black <- df$race == "black"
white <- df$race == "white"
other <- df$race == "other"
df$color[black] <- "red"
df$color[white] <- "blue"
df$color[other] <- "green"
race.colors <- c(black = "red", white = "blue", other = "green")

for (r in regions) {
  df.sub <- df[df$reg2 == r, ]
  plot(df.sub$exp,
       log(df.sub$wage),
       #col= df$color,
       pch = df.sub$deg01,
       main = r)
  #lines(supsmu(df$edu))
  # Fit each smoother on the current region's rows only, so that the curves
  # describe the panel they are drawn on rather than the whole data set.
  for (grp in names(race.colors)) {
    in.grp <- which(df.sub$race == grp)
    # A curve needs at least two points; skip groups too small in this region.
    if (length(in.grp) < 2L) next
    lines(supsmu(df.sub$exp[in.grp], log(df.sub$wage[in.grp])),
          col = race.colors[[grp]])
  }
  # Legend colours are the same named colours used for the curves, so they
  # match regardless of the active palette.
  legend("topleft", legend = c("Black", "White", "Other"),
         col = unname(race.colors), lty = c(1, 1, 1))
  legend("topright", legend = c("College degree", "No college degree"),
         pch = c(1, 4))
}




#http://stackoverflow.com/questions/17551193/r-color-scatter-plot-points-based-on-values
# Pick training and test data
# Use sample
# Set seed to 0
# Hold out 4965 randomly chosen rows as the test set; all remaining rows form
# the training set. The seed is fixed at 0 so the split is reproducible.
set.seed(0)
index <- sample(seq_len(nrow(df)), size = 4965, replace = FALSE)
test.data <- df[index, ]
train.data <- df[-index, ]
# `data` is an alias for the training set used by the plots below.
data <- train.data
# Quality control check: the share of "black" observations should be similar
# in the training and the test set.
with(train.data, sum(race == "black")) / nrow(data)
## [1] 0.07830597
with(test.data, sum(race == "black")) / nrow(test.data)
## [1] 0.07633434
# Rough model 1: wage on its raw scale, regressed on every predictor but emp
summary(
  object = lm(
    formula = wage ~ edu + exp + city + reg + race + deg + com,
    data = train.data
  )
)
##
## Call:
## lm(formula = wage ~ edu + exp + city + reg + race + deg + com,
## data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1106.0 -212.3 -51.0 141.5 18238.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -529.01624 21.76914 -24.301 < 2e-16 ***
## edu 58.25716 1.26061 46.214 < 2e-16 ***
## exp 10.77924 0.24169 44.599 < 2e-16 ***
## cityyes 103.89914 6.68588 15.540 < 2e-16 ***
## regnortheast 17.34384 8.45920 2.050 0.04035 *
## regsouth -29.67152 7.85718 -3.776 0.00016 ***
## regwest 13.20526 8.54327 1.546 0.12219
## raceother 131.61036 12.47726 10.548 < 2e-16 ***
## racewhite 132.91059 11.02090 12.060 < 2e-16 ***
## degyes 57.63865 9.48354 6.078 1.24e-09 ***
## com 0.01068 0.35925 0.030 0.97628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 405.1 on 19847 degrees of freedom
## Multiple R-squared: 0.2125, Adjusted R-squared: 0.2121
## F-statistic: 535.6 on 10 and 19847 DF, p-value: < 2.2e-16
# Rough model 2: same predictors as model 1, with log(wage) as the response
summary(
  object = lm(
    formula = log(wage) ~ edu + exp + city + reg + race + deg + com,
    data = train.data
  )
)
##
## Call:
## lm(formula = log(wage) ~ edu + exp + city + reg + race + deg +
## com, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7094 -0.3067 0.0381 0.3491 3.7425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3389570 0.0288166 150.572 < 2e-16 ***
## edu 0.0957897 0.0016687 57.403 < 2e-16 ***
## exp 0.0184726 0.0003199 57.738 < 2e-16 ***
## cityyes 0.1633313 0.0088503 18.455 < 2e-16 ***
## regnortheast 0.0330409 0.0111977 2.951 0.00317 **
## regsouth -0.0662718 0.0104008 -6.372 1.91e-10 ***
## regwest -0.0009764 0.0113090 -0.086 0.93120
## raceother 0.2340043 0.0165166 14.168 < 2e-16 ***
## racewhite 0.2397341 0.0145887 16.433 < 2e-16 ***
## degyes 0.0348181 0.0125537 2.774 0.00555 **
## com -0.0001071 0.0004755 -0.225 0.82177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5363 on 19847 degrees of freedom
## Multiple R-squared: 0.2874, Adjusted R-squared: 0.287
## F-statistic: 800.5 on 10 and 19847 DF, p-value: < 2.2e-16
# EDA (use smoothers): scatter plots of log(wage) against a predictor from
# the training data, each overlaid with a super-smoother curve in colour 2
##### com
plot(x = data$com, y = log(data$wage))
lines(x = supsmu(x = data$com, y = log(data$wage)), col = 2)

##### edu
plot(x = data$edu, y = log(data$wage))
lines(x = supsmu(x = data$edu, y = log(data$wage)), col = 2)

##### edu (boxplots of log wage for each education value)
# By default the boxes sit at positions 1..k, which is not the scale the
# smoother is computed on (the actual edu values). Drawing every box at its
# own edu value puts the boxes and the curve on the same x axis.
edu.levels <- sort(unique(data$edu))
boxplot(log(data$wage) ~ data$edu, at = edu.levels)
lines(supsmu(data$edu, log(data$wage)), col = 2)

##### exp
# log(wage) against experience, with a super-smoother curve in colour 2
plot(x = data$exp, y = log(data$wage))
lines(x = supsmu(x = data$exp, y = log(data$wage)), col = 2)

##### emp
# log(wage) against emp, with a super-smoother curve in colour 2
plot(x = data$emp, y = log(data$wage))
lines(x = supsmu(x = data$emp, y = log(data$wage)), col = 2)

#### Interaction plots
# Stand-alone copies of the training columns, so the plots get short labels.
city <- data$city
reg <- data$reg
wage <- data$wage
race <- data$race

# city vs. region (yes)
interaction.plot(x.factor = city, trace.factor = reg, response = log(wage))

# race vs. region (maybe not)
interaction.plot(x.factor = race, trace.factor = reg, response = log(wage))

# race vs. city (maybe not)
interaction.plot(x.factor = race, trace.factor = city, response = log(wage))

# edu vs. degree (the lines do cross)
# Points and fitted lines share colours: red = college degree, black = none.
degree <- data$deg == "yes"
# Explicit colour names work whether deg was read as a factor or as character.
plot(data$edu, log(wage), col = ifelse(degree, "red", "black"))
# Separate least-squares fits of log wage on edu for each group. The
# no-degree group is selected with !degree: negating a logical with "-"
# yields 0/-1 indices, which would only drop the first row.
abline(lm(log(wage)[degree] ~ data$edu[degree]), col = "red")
abline(lm(log(wage)[!degree] ~ data$edu[!degree]), col = "black")

# exp vs. race (the lines do cross)
# Colour the points by race group code. Converting through as.factor() keeps
# the plot working when race is a character column, where the raw values
# ("other", "white") are not usable as point colours.
plot(data$exp, log(wage), col = as.integer(as.factor(data$race)))

# Same scatter without point colours, overlaid with one super-smoother curve
# per race group (colours 2, 3, 4 as listed in the legend).
plot(data$exp, log(wage))
black <- data$race == "black"
white <- data$race == "white"
other <- data$race == "other"
lines(supsmu(data$exp[black], log(data$wage)[black]), col = 2)
lines(supsmu(data$exp[white], log(data$wage)[white]), col = 3)
lines(supsmu(data$exp[other], log(data$wage)[other]), col = 4)
legend("topright", legend = c("Black", "White", "Other"),
       col = c(2, 3, 4), lty = c(1, 1, 1))

# Defining indicators (dummy variables)
# One logical indicator per race value in the training data, wrapped in I().
black <- I(data[["race"]] == "black")
white <- I(data[["race"]] == "white")
other <- I(data[["race"]] == "other")
# Copy of the raw data `d` with descriptive column names, then a preview.
dfram <- with(d, data.frame(
  Wage = wage,
  Edu = edu,
  Exp = exp,
  City = city,
  Region = reg,
  Race = race,
  College = deg,
  Commute = com,
  Employees = emp
))
head(x = dfram)
## Wage Edu Exp City Region Race College Commute Employees
## 1 354.94 7 45 yes northeast white no 24.3 200
## 2 370.37 9 9 yes northeast white no 26.2 130
## 3 754.94 11 46 yes northeast white no 26.4 153
## 4 593.54 12 36 yes northeast other no 9.9 86
## 5 377.23 16 22 yes northeast white yes 7.1 181
## 6 284.90 8 51 yes northeast white no 11.4 32
# Split the renamed data by race (rows with a missing Race are dropped).
black <- dfram[which(dfram$Race == "black"), ]
white <- dfram[which(dfram$Race == "white"), ]
nonblack <- dfram[which(dfram$Race != "black"), ]
# Stack the black and white groups into two parallel vectors: a group label
# and the matching wage, black rows first.
race <- rep(c("black", "white"), times = c(nrow(black), nrow(white)))
wage <- c(black[["Wage"]], white[["Wage"]])
# Wage by race on the raw scale. outline = FALSE only hides the outlying
# points in the drawing; no observations are removed from the data.
boxplot(wage ~ race, main = "boxplot", xlab = "Race", ylab = "Wage",
        outline = FALSE)

# log wage: the y axis is labelled as such, since the values are log(wage)
boxplot(log(wage) ~ race, main = "boxplot", xlab = "Race", ylab = "log(Wage)")

#model.1 <- lm(wage~edu, data=d)
#plot(d$edu, d$wage)
#abline(-32.2755,51.5334)
#summary(model.1)
# Number of columns of `d` (the length of a data frame is its column count).
# NOTE(review): if n is meant to be the sample size, nrow(d) would be needed
# instead; confirm against how n is used later.
n <- ncol(d)
n
## [1] 9